{janitor}包清理变量名称

packages
Author

Tony Duan

Published

October 4, 2022

使用{janitor}包清理变量名称

读入数据

可见nurses数据集里的变量名非常凌乱

Code
nurses <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-10-05/nurses.csv')
names(nurses)
 [1] "State"                                          
 [2] "Year"                                           
 [3] "Total Employed RN"                              
 [4] "Employed Standard Error (%)"                    
 [5] "Hourly Wage Avg"                                
 [6] "Hourly Wage Median"                             
 [7] "Annual Salary Avg"                              
 [8] "Annual Salary Median"                           
 [9] "Wage/Salary standard error (%)"                 
[10] "Hourly 10th Percentile"                         
[11] "Hourly 25th Percentile"                         
[12] "Hourly 75th Percentile"                         
[13] "Hourly 90th Percentile"                         
[14] "Annual 10th Percentile"                         
[15] "Annual 25th Percentile"                         
[16] "Annual 75th Percentile"                         
[17] "Annual 90th Percentile"                         
[18] "Location Quotient"                              
[19] "Total Employed (National)_Aggregate"            
[20] "Total Employed (Healthcare, National)_Aggregate"
[21] "Total Employed (Healthcare, State)_Aggregate"   
[22] "Yearly Total Employed (State)_Aggregate"        

清理变量名称

Code
library(janitor)
library(tidyverse) 

整理后,变量名变得更加易读。

Code
library(janitor)
nurses_clean=nurses %>% clean_names
names(nurses_clean)
 [1] "state"                                       
 [2] "year"                                        
 [3] "total_employed_rn"                           
 [4] "employed_standard_error_percent"             
 [5] "hourly_wage_avg"                             
 [6] "hourly_wage_median"                          
 [7] "annual_salary_avg"                           
 [8] "annual_salary_median"                        
 [9] "wage_salary_standard_error_percent"          
[10] "hourly_10th_percentile"                      
[11] "hourly_25th_percentile"                      
[12] "hourly_75th_percentile"                      
[13] "hourly_90th_percentile"                      
[14] "annual_10th_percentile"                      
[15] "annual_25th_percentile"                      
[16] "annual_75th_percentile"                      
[17] "annual_90th_percentile"                      
[18] "location_quotient"                           
[19] "total_employed_national_aggregate"           
[20] "total_employed_healthcare_national_aggregate"
[21] "total_employed_healthcare_state_aggregate"   
[22] "yearly_total_employed_state_aggregate"       

清理之前的数据,需要删掉第一行,清理变量名,删除空列,删除都是相同值的列。

Code
#xl_file_0 <- readxl::read_excel('https://raw.githubusercontent.com/sfirke/janitor/main/dirty_data.xlsx', skip = 1)

#不读第一行
xl_file_0 <- readxl::read_excel('dirty_data.xlsx', skip = 1)
  
xl_file=xl_file_0 %>% 
  
  clean_names() %>%
  remove_empty() %>% 
  remove_constant()
xl_file
# A tibble: 12 × 9
   first_name   last_name employee_status subject    hire_date percent_allocated
   <chr>        <chr>     <chr>           <chr>          <dbl>             <dbl>
 1 Jason        Bourne    Teacher         PE             39690              0.75
 2 Jason        Bourne    Teacher         Drafting       43479              0.25
 3 Alicia       Keys      Teacher         Music          37118              1   
 4 Ada          Lovelace  Teacher         <NA>           38572              1   
 5 Desus        Nice      Administration  Dean           42791              1   
 6 Chien-Shiung Wu        Teacher         Physics        11037              0.5 
 7 Chien-Shiung Wu        Teacher         Chemistry      11037              0.5 
 8 James        Joyce     Teacher         English        36423              0.5 
 9 Hedy         Lamarr    Teacher         Science        27919              0.5 
10 Carlos       Boozer    Coach           Basketball     42221             NA   
11 Young        Boozer    Coach           <NA>           34700             NA   
12 Micheal      Larsen    Teacher         English        40071              0.8 
# … with 3 more variables: full_time <chr>, certification_9 <chr>,
#   certification_10 <chr>

数值型日期变为日期型

Code
xl_file %>% 
  mutate(hire_date = excel_numeric_to_date(hire_date))
# A tibble: 12 × 9
   first_name   last_name employee_status subject    hire_date  percent_allocat…
   <chr>        <chr>     <chr>           <chr>      <date>                <dbl>
 1 Jason        Bourne    Teacher         PE         2008-08-30             0.75
 2 Jason        Bourne    Teacher         Drafting   2019-01-14             0.25
 3 Alicia       Keys      Teacher         Music      2001-08-15             1   
 4 Ada          Lovelace  Teacher         <NA>       2005-08-08             1   
 5 Desus        Nice      Administration  Dean       2017-02-25             1   
 6 Chien-Shiung Wu        Teacher         Physics    1930-03-20             0.5 
 7 Chien-Shiung Wu        Teacher         Chemistry  1930-03-20             0.5 
 8 James        Joyce     Teacher         English    1999-09-20             0.5 
 9 Hedy         Lamarr    Teacher         Science    1976-06-08             0.5 
10 Carlos       Boozer    Coach           Basketball 2015-08-05            NA   
11 Young        Boozer    Coach           <NA>       1995-01-01            NA   
12 Micheal      Larsen    Teacher         English    2009-09-15             0.8 
# … with 3 more variables: full_time <chr>, certification_9 <chr>,
#   certification_10 <chr>

四舍五入

R里的四舍五入是入到双数

Code
round(seq(0.5, 4.5, 1))
[1] 0 2 2 4 4

可以用{janitor}包里的round_half_up做到四舍五入

Code
round_half_up(seq(0.5, 4.5, 1))
[1] 1 2 3 4 5

找相同行

使用get_dupes找相同行,并自动加上相同的观察数dupe_count 。

Code
starwars %>% 
  get_dupes(homeworld) %>% 
  select(1:5)
# A tibble: 48 × 5
   homeworld dupe_count name                height  mass
   <chr>          <int> <chr>                <int> <dbl>
 1 Alderaan           3 Leia Organa            150  49  
 2 Alderaan           3 Bail Prestor Organa    191  NA  
 3 Alderaan           3 Raymus Antilles        188  79  
 4 Corellia           2 Han Solo               180  80  
 5 Corellia           2 Wedge Antilles         170  77  
 6 Coruscant          3 Finis Valorum          170  NA  
 7 Coruscant          3 Adi Gallia             184  50  
 8 Coruscant          3 Jocasta Nu             167  NA  
 9 Kamino             3 Boba Fett              183  78.2
10 Kamino             3 Lama Su                229  88  
# … with 38 more rows

也可以同时找多个变量的相同行。

Code
starwars %>% 
  get_dupes(homeworld,eye_color) %>% 
  select(1:5)
# A tibble: 32 × 5
   homeworld eye_color dupe_count name                height
   <chr>     <chr>          <int> <chr>                <int>
 1 Alderaan  brown              3 Leia Organa            150
 2 Alderaan  brown              3 Bail Prestor Organa    191
 3 Alderaan  brown              3 Raymus Antilles        188
 4 Coruscant blue               3 Finis Valorum          170
 5 Coruscant blue               3 Adi Gallia             184
 6 Coruscant blue               3 Jocasta Nu             167
 7 Kamino    black              2 Lama Su                229
 8 Kamino    black              2 Taun We                213
 9 Kashyyyk  blue               2 Chewbacca              228
10 Kashyyyk  blue               2 Tarfful                234
# … with 22 more rows

(table)数变量的分布

Code
table(mpg$manufacturer)

      audi  chevrolet      dodge       ford      honda    hyundai       jeep 
        18         19         37         25          9         14          8 
land rover    lincoln    mercury     nissan    pontiac     subaru     toyota 
         4          3          4         13          5         14         34 
volkswagen 
        27 

(tabyl)数变量的分布,不仅有有数量还有占比。

Code
tabyl(mpg,manufacturer)
 manufacturer  n    percent
         audi 18 0.07692308
    chevrolet 19 0.08119658
        dodge 37 0.15811966
         ford 25 0.10683761
        honda  9 0.03846154
      hyundai 14 0.05982906
         jeep  8 0.03418803
   land rover  4 0.01709402
      lincoln  3 0.01282051
      mercury  4 0.01709402
       nissan 13 0.05555556
      pontiac  5 0.02136752
       subaru 14 0.05982906
       toyota 34 0.14529915
   volkswagen 27 0.11538462

Reference

{janitor} by Sam Firke

document by Albert Rapp: https://albert-rapp.de/posts/07_janitor_showcase/07_janitor_showcase.html https://www.youtube.com/watch?v=AKPvlNWZBEQ